Introduction

This report analyzes proteomics data. The analysis includes:

Data loading and quality control

Protein and peptide quantification

Statistical analysis

Visualization of results

library(ggplot2)
library(dplyr)

custom_theme <- function() {
  theme_bw(base_size = 10) +
    theme(
      # Vertical x-axis labels (applies to ALL plots)
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      
      # Unified text sizes
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      plot.title = element_text(size = 14, face = "bold"),
      legend.text = element_text(size = 11),
      legend.title = element_text(size = 12),
      
      # Other consistent styling
      legend.position = "bottom",
      panel.grid.major.y = element_line(color = "gray90")
    )
}

# Make this theme apply to all plots automatically
theme_set(custom_theme())

Load All Required Libraries

library(tidyverse)
library(ggrepel)
library(DEP)
library(missForest)
library(pheatmap)
library(openxlsx)
library(RColorBrewer)
library(ggpubr)
library(dichromat)
library(factoextra)
library(ggplotify)
library(OpenImageR)
library(svDialogs)
library(raster)
library(viridis)
library(patchwork)
library(here)
if (!requireNamespace("preprocessCore", quietly = TRUE)) {
  install.packages("preprocessCore")
}
library(preprocessCore)

Data Loading and Preparation

# Set working directory to script location
sampleTypeDir <- file.path(dirname(rstudioapi::getSourceEditorContext()$path))
setwd(sampleTypeDir)
default_dir <- paste0(sampleTypeDir, "/*.*")

# Load custom functions
source("functions_ver2.3.R")

# Select input files interactively
protein_file_name <- choose.files(
  default = default_dir, 
  caption = "Please, select proteins file", 
  multi = FALSE
)
file_loc <- stringr::str_replace(protein_file_name, pattern = "_Proteins.txt", replacement = "")

sample_list_file_name <- choose.files(
  default = default_dir, 
  caption = "Please, select sample list file", 
  multi = FALSE
)

# Define peptides file
peptides_file_name <- paste0(file_loc, "_PeptideGroups.txt")

Extract Project Name

projectName <- t(data.frame(strsplit(protein_file_name, split = "\\\\")))
projectName <- projectName[,ncol(projectName)]
projectName <- stringr::str_replace(projectName, "_Proteins.txt", replacement = "")

Create Output Directory

Data Loading and Initial Processing

# Load protein data
proteinGroups <- read.delim(protein_file_name, check.names = FALSE)
peptides <- read.delim(peptides_file_name, check.names = FALSE)
# Load sample list
sample_list <- read.delim(sample_list_file_name, check.names = FALSE)


group_levels <- unique(sample_list$Group)

# If more than 2 groups, define which to use
if (length(group_levels) != 2) {
  # Specify the two groups you want to analyze
  selected_groups <- c("Prototype", "MPSP")

  # Check they exist in the data
  if (!all(selected_groups %in% group_levels)) {
    stop("Selected groups not found in the sample list. Available groups: ", paste(group_levels, collapse = ", "))
  }

  # Filter to only selected groups
  sample_list <- sample_list %>% filter(Group %in% selected_groups)
  group_levels <- selected_groups
}

# Assign groups
GroupA <- group_levels[1]
GroupB <- group_levels[2]
ratio_name <- paste0(GroupA, "/", GroupB)

# Final filtering to ensure only the two groups are used
sample_list <- sample_list %>% filter(Group %in% c(GroupA, GroupB))

## Automatically detect group order
ordered_groups <- unique(sample_list$Group)

# Automatically detect the 2 group names from sample_list
ordered_groups <- sort(unique(sample_list$Group))  # Alphabetical or custom logic

# Color Palette
group_names <- unique(sample_list$Group)
#extended_palette <- brewer.pal(n = max(3, length(group_names)), name = "Paired")
extendet_pallete <- colorRampPalette(brewer.pal(12, name = "Paired"))(length(group_names))
group_colors <- setNames(extendet_pallete[1:length(group_names)], group_names)

Set Filtering Criteria

no_peptides <- 2
is_master_protein <- TRUE

Prepare Protein Data

### Prepare Protein Data

# Select columns for protein file 
list_of_columns_proteins <- c(
  "Number", 
  "Accession", 
  "Description", 
  "Master", 
  "Coverage [%]",
  "# Peptides", 
  "# PSMs", 
  "# Unique Peptides", 
  "MW [kDa]", 
  "Contaminant",
  "# Protein Groups", 
  "Sequence"
)

# Verify all requested columns exist
missing_cols <- setdiff(list_of_columns_proteins, names(proteinGroups))
if(length(missing_cols) > 0) {
  warning(paste("The following columns are missing:", paste(missing_cols, collapse=", ")))
}

# Select the identification columns
proteinGroups_identification <- proteinGroups %>%
  dplyr::select(any_of(list_of_columns_proteins), starts_with("Found in Sample:"))

# Select abundance columns - note your actual columns start with "Abundance:" not "Abundances:"
proteinGroups_abundances <- proteinGroups %>%
  dplyr::select(starts_with("Abundance:"))

# Handle empty values
proteinGroups_abundances[proteinGroups_abundances == ''] <- NA

# Select only samples from sample list
proteinGroups_abundances <- proteinGroups_abundances %>%
  dplyr::select(all_of(sample_list$RawFile))

# Update sample names
new_sample_names <- sample_list$Sample
colnames(proteinGroups_abundances) <- new_sample_names

# Merge identification and abundance data
proteinGroups_merged <- cbind(proteinGroups_identification, proteinGroups_abundances)

Data Filtering

# Apply filtering criteria
filtered <- proteinGroups_merged

if(is_master_protein == TRUE){
  filtered1 <- filtered[filtered$Master == "IsMasterProtein",]
} else {
  filtered1 <- filtered
}

filtered2 <- filtered1[filtered1$`# Peptides` >= no_peptides,]
filtered3 <- filtered2[filtered2$Contaminant != "True",]

proteinGroups_filtered <- filtered3
# Filter Peptides
peptides_filtered <- peptides %>%
  dplyr::select(Sequence, starts_with("Abundance:"), "XCorr (by Search Engine): Sequest HT") %>%
  filter(`XCorr (by Search Engine): Sequest HT` >= 1.5) %>%
  dplyr::select(Sequence, starts_with("Abundance:"))

# Rename columns to match sample names
colnames(peptides_filtered) <- c("Sequence", sample_list$Sample)

Data Quality Assessment

Protein and Peptide Quantification

# Prepare extended color palette
extendet_pallete <- colorRampPalette(brewer.pal(8, name = "Dark2"))(
  length(unique(sample_list$Group))
)

# Protein quantification
proteinGroups_filtered_quality <- proteinGroups_filtered %>%
  dplyr::select(all_of(sample_list$Sample))

quantified_proteins <- data.frame(colSums(!is.na(proteinGroups_filtered_quality)))
quantified_proteins <- rownames_to_column(quantified_proteins)
colnames(quantified_proteins) <- c("Sample", "Quantified_Proteins")
quantified_proteins$Group <- sample_list$Group

Visualization of Quantified Proteins

Click to expand: Quantified Proteins
## Number of Quantified Proteins
quantified_proteins$Group <- factor(quantified_proteins$Group, levels = ordered_groups)

if(nrow(sample_list) > 20){
  plot_quant_proteins <- ggplot(quantified_proteins, aes(x = Sample, y = Quantified_Proteins))+
    geom_point(aes(color = Group), size = 2)+
    geom_segment(aes(x = Sample, xend = Sample, y = 0, yend = Quantified_Proptides, color = Group))+
    theme_classic()+
    labs(y = "Number of filtered and quantified proteins", x = NULL)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
    scale_color_manual(values = group_colors)+
    geom_text(aes(label=Quantified_Proteins), vjust= -1.0, hjust = 0.5, angle = 90, 
              color="black", size=4)
} else {
  plot_quant_proteins <- ggplot(quantified_proteins, 
                               aes(x = Sample, y = Quantified_Proteins, fill = Group))+
    geom_bar(stat = "identity", position = "dodge")+
    geom_text(aes(label=Quantified_Proteins), vjust=0.5, hjust = 1.5, angle = 90, 
              color="white", size=4)+
    theme_classic()+
    labs(y = "Number of filtered and quantified proteins", x = NULL)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
    scale_fill_manual(values = group_colors)  
}

plot_quant_proteins

# Save both PNG and SVG
ggsave(
  filename = "Number of filtered and quantified proteins.png",
  plot     = plot_quant_proteins,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Number of filtered and quantified proteins.svg",
  plot     = plot_quant_proteins,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Peptide Quantification

peptides_filtered_quality <- peptides_filtered %>%
  dplyr::select(all_of(sample_list$Sample))

quantified_peptides <- data.frame(colSums(!is.na(peptides_filtered_quality)))
quantified_peptides <- rownames_to_column(quantified_peptides)
colnames(quantified_peptides) <- c("Sample", "quantified_peptides")
quantified_peptides$Group <- sample_list$Group
quantified_peptides$Group <- factor(quantified_peptides$Group, levels = unique(quantified_peptides$Group))

Visualization of Quantified Peptides

Click to expand: Quantified Peptides
# Ensure group ordering
quantified_peptides$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
plot_quant_peptides <- ggplot(quantified_peptides, aes(x = Sample, y = quantified_peptides, fill = Group))+
  geom_bar(stat = "identity", position = "dodge")+
  geom_text(aes(label=quantified_peptides), vjust=0.5, hjust = 1.5, angle = 90, color="white",
            position = position_dodge(0.9), size=4)+
  theme_classic()+
  labs(y = "Number of filtered and quantified peptides", x = NULL)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
  scale_fill_manual(values = group_colors)

plot_quant_peptides

# Save both PNG and SVG
ggsave(
  filename = "Number of filtered and quantified peptides.png",
  plot     = plot_quant_peptides,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Number of filtered and quantified peptides.svg",
  plot     = plot_quant_peptides,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

QC – Average Quantified Peptides and Proteins

Click to expand: Average Quantified Peptides and Proteins
# Create export folder if it doesn’t exist
export_dir <- "exported_images"
if (!dir.exists(export_dir)) {
  dir.create(export_dir)
}

# Peptides

quantified_peptides$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
quant_peptides_average <- quantified_peptides %>%
  dplyr::group_by(Group) %>%
  dplyr::summarise(Mean = mean(quantified_peptides), SD = sd(quantified_peptides))

quant_peptides_average$Group <- factor(quant_peptides_average$Group, levels = unique(quant_peptides_average$Group))

p_peptides <- ggplot(quant_peptides_average, aes(x=Group, y=Mean, fill=Group)) + 
  geom_bar(stat="identity", color="black", position=position_dodge()) +
  geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=.2, position=position_dodge(.9)) +
  labs(y = "Average number of quantified peptides") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))+
  scale_fill_manual(values = group_colors)

p_peptides

# Save both PNG and SVG
ggsave(
  filename = "Average number of quantified peptides.png",
  plot     = p_peptides,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Average number of quantified peptides.svg",
  plot     = p_peptides,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

#Proteins

quantified_proteins$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
quant_proteins_average <- quantified_proteins %>%
  dplyr::group_by(Group) %>%
  dplyr::summarise(Mean = mean(Quantified_Proteins), 
                   SD = sd(Quantified_Proteins))

quant_proteins_average$Group <- factor(quant_proteins_average$Group, 
                                     levels = unique(quant_proteins_average$Group))

p_proteins <- ggplot(quant_proteins_average, aes(x=Group, y=Mean, fill=Group)) + 
  geom_bar(stat="identity", color="black", position=position_dodge(), width=0.7) +
  geom_bar(stat="identity", color="black", position=position_dodge()) +
  geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=.2, position=position_dodge(.9)) +
  labs(y = "Average number of quantified proteins") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))+
  scale_fill_manual(values = group_colors)

p_proteins

# Save both PNG and SVG
ggsave(
  filename = "Average number of quantified proteins.png",
  plot     = p_proteins,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Average number of quantified proteins.svg",
  plot     = p_proteins,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Re-order columns

# Create Number column if it doesn't exist in BOTH data frames
if(!"Number" %in% names(proteinGroups_merged)) {
  proteinGroups_merged$Number <- seq_len(nrow(proteinGroups_merged))
}

if(!"Number" %in% names(proteinGroups_filtered)) {
  proteinGroups_filtered$Number <- seq_len(nrow(proteinGroups_filtered))
}

# Define column order
list_of_columns_proteins_order <- c(
  "Number", "Accession", "Description", "MW [kDa]",
  colnames(proteinGroups_abundances),
  "# Peptides", "# PSMs", "# Unique Peptides", "Coverage [%]", 
  "Contaminant", "# Protein Groups", "Master", "Sequence"
)

# Verify columns exist in BOTH data frames
existing_cols_merged <- intersect(list_of_columns_proteins_order, names(proteinGroups_merged))
existing_cols_filtered <- intersect(list_of_columns_proteins_order, names(proteinGroups_filtered))

# Get union of columns that exist in either data frame
final_cols <- union(existing_cols_merged, existing_cols_filtered)

# Apply column selection
proteinGroups_merged <- proteinGroups_merged %>%
  dplyr::select(all_of(final_cols))

proteinGroups_filtered <- proteinGroups_filtered %>%
  dplyr::select(all_of(final_cols))

# Warning about any missing columns
missing_cols <- setdiff(list_of_columns_proteins_order, final_cols)
if(length(missing_cols) > 0) {
  warning(paste("The following columns are missing from one or both data frames:",
                paste(missing_cols, collapse=", ")))
}

QC – Violin Plot of Intensities

Click to expand: Violin Plot of Intensities
# Prepare abundance data
qc_abundance <- proteinGroups_abundances %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Abundance") %>%
  dplyr::mutate(
    Abundance = na_if(Abundance, 0), 
    Abundance_log = log2(Abundance)
  )

# Extract group from sample name
qc_abundance <- qc_abundance %>%
  left_join(sample_list[, c("Sample", "Group")], by = "Sample") %>%
  mutate(Group = factor(Group, levels = ordered_groups))

# Median intensity for reference line
overall_median <- median(qc_abundance$Abundance_log, na.rm = TRUE)

# QC violin per sample
qc_abundance_long <- proteinGroups_abundances %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Abundance") %>%
  mutate(Abundance_log = log2(na_if(Abundance, 0))) %>%
  left_join(sample_list[, c("Sample", "Group")], by = "Sample")

violin_qc_plot <- ggplot(qc_abundance_long, aes(x = Sample, y = Abundance_log, fill = Group)) +
  geom_violin(trim = FALSE, alpha = 0.7) +
  geom_boxplot(width = 0.1, fill = "white") +
  geom_hline(yintercept = overall_median, color = "red", linetype = "dashed", size = 1) +
  facet_wrap(~ Group, scales = "free_x") +
  labs(title = "Protein Intensity Distribution per Sample",
       x = "Sample", y = "Log2 Intensity") +
  scale_fill_manual(values = group_colors) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))

violin_qc_plot

# Save both PNG and SVG
ggsave(
  filename = "violinplot_protein_abundance.png",
  plot     = violin_qc_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "violinplot_protein_abundance.svg",
  plot     = violin_qc_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Filtering, Normalization and Imputation Normalization Effect Plot

Click to expand: Filtering, Normalization and Imputation
# Step 1: Prepare raw protein data
protein_df <- proteinGroups_filtered %>%
  dplyr::select("Accession", all_of(sample_list$Sample))
rownames(protein_df) <- protein_df$Accession
protein_df <- protein_df[, -1]



# Step 2: Filtering (on raw intensities)
df_filtered <- filter_valids_gpt(
  protein_df,
  sample_list,
  min_count    = 2,
  at_least_one = TRUE
)

# --- Create log-transformed version for plotting only ---
df_raw_log <- log2(as.matrix(df_filtered) + 1)
df_raw_log <- as.data.frame(df_raw_log)
df_raw_log$Type <- "Before Normalization"

# --- Keep raw for normalization ---
df_unlogged <- df_filtered
df_unlogged[df_unlogged == 0] <- NA  # replace 0s with NA before normalization

normalized_list <- list(
  "None"            = normalize_df(df_unlogged, 0),
  "Median"          = normalize_df(df_unlogged, 1),
  "Sum"             = normalize_df(df_unlogged, 2),
  "Batch"           = normalize_df(df_unlogged, 3),
  "Day"             = normalize_df(df_unlogged, 4),
  "Quantile Batch"  = normalize_df(df_unlogged, 5),
  "Quantile"        = normalize_df(df_unlogged, 6)
)

normalized_list <- lapply(normalized_list, function(df) {
  df[] <- lapply(df, function(col) {
    if (is.numeric(col)) log2(col + 1e-6) else col
  })
  as.data.frame(df)
})

# Choose normalization to proceed
df_normalized <- as.data.frame(normalized_list[["Median"]])  # <-- change here if needed
df_normalized$Type   <- "After Normalization"

# Step 5: Combine both for plotting
df_combined <- bind_rows(
  df_raw_log %>%
    rownames_to_column("Protein") %>%
    dplyr::select(-Type) %>%  # Remove Type before pivoting
    pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
    mutate(Type = "Before Normalization"),
  
  df_normalized %>%
    rownames_to_column("Protein") %>%
    dplyr::select(-Type) %>%
    pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
    mutate(Type = "After Normalization")
)


# Step 6: Merge group info
df_combined <- left_join(df_combined, sample_list[, c("Sample", "Group")], by = "Sample")
df_combined$Type <- factor(df_combined$Type, levels = c("Before Normalization", "After Normalization"))
df_combined$Group <- factor(df_combined$Group, levels = ordered_groups)

# Step 7: Plot the normalization effect
# Compute one median Intensity per “Type” (facet):
median_by_type <- df_combined %>%
  group_by(Type) %>%
  summarize(facet_med = median(Intensity, na.rm = TRUE)) 

# Now build the combined violin + boxplot + median‐line plot:
plot_per_sample_violin <- ggplot(df_combined, aes(x = Sample, y = Intensity, fill = Group)) +
  
  # 1) Draw per‐sample violins:
  geom_violin(
    trim  = FALSE,
    width = 1.0,
    alpha = 0.6,
    color = NA
  ) +
  
  # 2) Draw a narrow boxplot inside each violin:
  geom_boxplot(
    width         = 0.15,
    fill          = "white",
    color         = "black",
    outlier.shape = NA,
    lwd           = 0.3
  ) +
  
  # 3) Add per‐sample median dot (optional; you can remove if you prefer only the horizontal line):
  stat_summary(
    aes(group = Sample),
    fun   = median,
    geom  = "point",
    shape = 21,
    fill  = "white",
    color = "black",
    size  = 1.5
  ) +
  
  # 4) Draw a single dashed horizontal line at each facet’s median:
  geom_hline(
    data = median_by_type,
    aes(yintercept = facet_med),
    color     = "red",
    linetype  = "dashed",
    size      = 0.6
  ) +
  
  # 5) Facet into one column: “Before” on top, “After” below
  facet_wrap(~Type, ncol = 1, strip.position = "top") +
  
  # 6) Color each violin by its Group
  scale_fill_manual(values = group_colors) +
  
  # 7) Labels
  labs(
    title = "Effect of Normalization on Protein Intensity",
    x     = "Sample",
    y     = expression(Log[2] ~ "Intensity")
  ) +
  
  # 8) Theme tweaks for a cleaner look
  theme_classic(base_size = 12) +
  theme(
    # Move the main title closer to the facets:
    plot.title       = element_text(face = "bold", hjust = 0.5, size = 14, margin = margin(b = 8)),
    
    # Style the facet strip (white background, black border):
    strip.background = element_rect(fill = "white", color = "black", size = 0.4),
    strip.text       = element_text(face = "bold", size = 11),
    
    # Rotate sample names 90° under each violin, shrink font:
    axis.text.x      = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 9),
    axis.text.y      = element_text(size = 10),
    axis.title.x     = element_text(size = 11, margin = margin(t = 6)),
    axis.title.y     = element_text(size = 11, margin = margin(r = 6)),
    
    # Legend on the right (Groups colored consistently)
    legend.position  = "right",
    legend.title     = element_text(size = 11),
    legend.text      = element_text(size = 9),
    
    # Reduce vertical spacing between facets so they sit closer:
    panel.spacing.y  = unit(0.5, "lines")
  )

# 9) Print the plot to the Rmd
print(plot_per_sample_violin)

# Step 8: Imputation
imputation_mode <- 2  # 1 = MNAR only; 2 = MNAR + missForest

if (imputation_mode == 1) {
  # Single-step: group-wise MNAR only
  suppressMessages({
    df_norm_valid_imp <- group_MNAR_impute_data(df_normalized, frequency_threshold = 2)
  })

} else if (imputation_mode == 2) {
  # Two-step: MNAR group-wise, then missForest
  suppressMessages({
    step1 <- group_MNAR_impute_data(df_normalized, frequency_threshold = 2) %>%
  dplyr::select(!contains("imputed")) %>%
  dplyr::select(where(is.numeric))  # removes Type safely
})

  # Run missForest quietly
  suppressMessages({
    missforest_result <- missForest(as.matrix(step1), verbose = FALSE)
  })

  # Add imputation mask for later QC
  missing_mask <- is.na(as.matrix(step1))
  colnames(missing_mask) <- paste0("imputed_", colnames(step1))

  df_norm_valid_imp <- as.data.frame(missforest_result$ximp)
  df_norm_valid_imp <- cbind(df_norm_valid_imp, missing_mask)
}


# Save both PNG and SVG
ggsave(
  filename = "Effect of Normalization on Protein Intensity.png",
  plot     = plot_per_sample_violin,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Effect of Normalization on Protein Intensity.svg",
  plot     = plot_per_sample_violin,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Effect of Imputation on Protein Intensity Distributions

Click to expand: Imputation Effect per Sample
# Prepare long-format data using the helper function (already in your functions_ver2.5.R)

df_long_imputed <- transform_imputed_to_long(df_norm_valid_imp)

# Relabel imputation source for clarity
df_long_imputed <- df_long_imputed %>%
  mutate(
    Type = ifelse(Source, "Imputed", "Original"),
    Type = factor(Type, levels = c("Original", "Imputed")),
    Group = sample_list$Group[match(Sample, sample_list$Sample)],
    Group = factor(Group, levels = ordered_groups)
  )

# Imputation effect plot
medians <- df_long_imputed %>%
  filter(Type == "Imputed") %>%
  group_by(Sample) %>%
  summarise(median_intensity = median(Intensity, na.rm = TRUE))

# Faceted histogram to show distributions
hist_dist_missing_val <- ggplot(df_long_imputed, aes(x = Intensity)) +
  geom_histogram(aes(fill = Type), binwidth = 1, position = "identity", alpha = 0.8) +
  scale_fill_manual(
    name   = "Data Source",
    values = c("Original" = "cyan3", "Imputed" = "firebrick1"),
    labels = c("Original", "Imputed")
  ) +
  facet_wrap(~Sample, scales = "free_y") +
  labs(
    title = "Distribution of Imputed vs Original Values",
    x     = "Intensity",
    y     = "Frequency"
  ) +
  theme_classic(base_size = 12) +
  theme(
    strip.text = element_text(face = "bold"),
    legend.position = "right",
    legend.title    = element_text(size = 11),
    legend.text     = element_text(size = 10)
  )

# Print the histogram
print(hist_dist_missing_val)

# Save both PNG and SVG
ggsave(
  filename = "Distribution of Intensities: Original vs Imputed.png",
  plot     = hist_dist_missing_val,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Distribution of Intensities: Original vs Imputed.svg",
  plot     = hist_dist_missing_val,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Investigate Missing Values

Click to expand: Investigate Missing Values
Click to expand: Investigate Missing Values- Heatmap

Scatter Plot Matrix

Click to expand: Multi Scatter Plot Matrix

Calculate CV- Coefficient of Variation

Click to expand: Coefficient of Variation- Violin Plot
# Join sample_list group info and clean Group
# Check critical objects
stopifnot(
  exists("df_norm_valid_imp"),  
  exists("sample_list"),        
  all(sample_list$Sample %in% colnames(df_norm_valid_imp))  
)
group_info <- sample_list$Group  
names(group_info) <- sample_list$Sample  

group_info_df <- data.frame(
  Sample = names(group_info),  
  Group = as.character(group_info)  
)

df_norm_valid_imp_long <- df_norm_valid_imp %>%  
  rownames_to_column("Protein") %>%  
  pivot_longer(
    cols = -Protein,  
    names_to = "Sample",  
    values_to = "Intensity"
  )
# Now join safely
qc_cv <- inner_join(df_norm_valid_imp_long, group_info_df, by = "Sample")

# Unlog intensity
qc_cv$IntensityUnlog <- 2^qc_cv$Intensity

# Calculate CV per protein per group
qc_cv_df <- qc_cv %>%
  group_by(Protein, Group) %>%
  summarise(cv = cv(IntensityUnlog, na.rm = TRUE), .groups = "drop") %>%
  na.omit() %>%
  mutate(Group = factor(Group, levels = ordered_groups))

# Enhanced plot
plot_violin_cv <- ggplot(qc_cv_df, aes(x = Group, y = cv, fill = Group)) +
  geom_violin(trim = FALSE, alpha = 0.7, width = 0.8, color = NA) +
  geom_boxplot(width = 0.15, fill = "white", color = "black", outlier.shape = NA) +
  scale_fill_manual(values = group_colors) +
  scale_y_continuous(expand = c(0, 0))+
  geom_hline(yintercept = 20, linetype = "dashed", color = "red", linewidth = 1) +
  labs(
  x = "Group",
  y = "Coefficient of Variation (%)",
  title = "Technical Reproducibility Across Experimental Groups",
  caption = paste("n =", format(nrow(qc_cv_df)/3), "proteins per group | CV = SD/mean * 100")
) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1.1, size = 10),
    axis.title.y = element_text(size = 12),
    panel.grid.major.y = element_line(color = "gray90")
  )

print(plot_violin_cv)

# Save both PNG and SVG
ggsave(
  filename = "CV Violin Plot-Technical Reproducibility Across Experimental Groups.png",
  plot     = plot_violin_cv,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "CV Violin Plot-Technical Reproducibility Across Experimental Groups.svg",
  plot     = plot_violin_cv,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Protein CV Category Distribution

Click to expand: Protein CV Category Distribution- Bar Plot
cv_barplot <- qc_cv_df %>%
  mutate(CV_Category = case_when(
    cv < 10 ~ "< 10%",
    cv >= 10 & cv < 20 ~ "10–20%", 
    cv >= 20 & cv < 30 ~ "20–30%",
    cv >= 30 & cv <= 100 ~ "30–100%",
    TRUE ~ "> 100%"  # optional fallback
  ))

# Reorder levels
cv_barplot$CV_Category <- factor(cv_barplot$CV_Category,
                                 levels = c("< 10%", "10–20%", "20–30%", "30–100%", "> 100%"))

# Summarize by group and CV bin
cv_barplot_summary <- cv_barplot %>%
  filter(CV_Category != "> 100%") %>%
  group_by(Group, CV_Category) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Group) %>%
  mutate(Percentage = Count / sum(Count) * 100)

# Define new elegant color palette
mycolors <- c(
  "< 10%"    = "#A6BDDB",
  "10–20%"   = "#74A9CF",
  "20–30%"   = "#2B8CBE",
  "30–100%"  = "#045A8D")

# Plot: Protein CV Category Distribution
CV_barplot <- ggplot(cv_barplot_summary, aes(x = Group, y = Percentage, fill = CV_Category)) +
  geom_bar(stat = "identity", width = 0.85) +
  scale_fill_manual(values = mycolors) +
  labs(
    y = "Percentage of Proteins", 
    fill = "CV Category",
    title = "Protein CV Category Distribution"
  ) +
  geom_text(
    aes(label = sprintf("%.1f%%", Percentage)), 
    position = position_stack(vjust = 0.5),
    size = 4.5, color = "white"
  ) +
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    axis.title.y = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  )

CV_barplot

# Save both PNG and SVG
ggsave(
  filename = "Barplot Protein CV Category Distribution.png",
  plot     = CV_barplot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "Barplot Protein CV Category Distribution.svg",
  plot     = CV_barplot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

QC: CV vs. Average Intensity (2D Density Plot)

Click to expand: Volcano Plot
### QC: CV vs. Mean Intensity (2D Density Plot by Group)

# Remove rows that are entirely NA
df_norm_valid_imp <- df_norm_valid_imp[rowSums(is.na(df_norm_valid_imp)) < ncol(df_norm_valid_imp), ]

# Prepare CV stats — CV calculated on raw (unlogged) intensity
qc_stats <- df_norm_valid_imp %>%
  rownames_to_column("Protein") %>%
  pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
  mutate(Intensity = 2^Intensity) %>%  # ✅ reverse log2
  left_join(sample_list, by = "Sample") %>%
  filter(!is.na(Group)) %>%  
  group_by(Protein, Group) %>%
  summarise(
    mean_intensity = mean(Intensity, na.rm = TRUE),
    sd_intensity = sd(Intensity, na.rm = TRUE),
    cv = 100 * (sd_intensity / mean_intensity),
    .groups = "drop"
  ) %>%
  filter(is.finite(cv), is.finite(mean_intensity))

# Create hexbin plot with facets
hex_plot <- ggplot(qc_stats, aes(x = cv, y = log2(mean_intensity))) +
  geom_hex(bins = 60) +
  facet_wrap(~Group) +
  scale_fill_viridis_c(option = "D") +
  labs(
    title = "2D Density Plot: CV vs Mean Intensity (by Group)",
    x = "Coefficient of Variation (CV, %)",
    y = "Mean Intensity (log2)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.position = "right"
  )

# Display the plot
print(hex_plot)

# Save both PNG and SVG
ggsave(
  filename = "2D Density Plot: CV vs Mean Intensity.png",
  plot     = hex_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "2D Density Plot: CV vs Mean Intensity.svg",
  plot     = hex_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

T-Test

# Select only 2 groups of interest
selected_groups <- c("Prototype", "MPSP")

# Debug: rebuild group_info
group_info <- sample_list$Group
names(group_info) <- sample_list$Sample

# Filter for selected groups
group_filter <- group_info %in% selected_groups

# Subset the data
df_stat <- df_norm_valid_imp[, names(group_info)[group_filter]] %>%
  dplyr::select(!contains("imputed"))
group_info <- factor(group_info[group_filter], levels = selected_groups)

# T-test function
t.test.2 <- function(x, y) {
  tryCatch({
    if (length(unique(y)) != 2 || any(table(y) < 2)) {
      return(NA)
    }
    t.test(x ~ y)$p.value
  }, error = function(e) {
    return(NA)
  })
}

# Calculate p-values
p.values <- apply(df_stat , 1, t.test.2, group_info)
p.values.adj <- p.adjust(p.values, "BH")
proteins_stat <- cbind(df_stat, p.values, p.values.adj) %>% data.frame() %>%
  rownames_to_column(var = "Accession")

Create df_description if missing

if(!exists("df_description")) {
  gene_name <- proteinGroups_filtered$Description
  gene_name <- str_split(gene_name, pattern = "GN=")
  collected_gene_names <- sapply(gene_name, function(x) str_split(x[2], pattern = " ")[[1]][1])
  
  gene_name <- proteinGroups_filtered$Description
  gene_name <- str_split(gene_name, pattern = "OS=")
  collected_description <- sapply(gene_name, function(x) x[1])
  
  df_description <- data.frame(
    "Accession" = proteinGroups_filtered$Accession,
    "Gene" = collected_gene_names,
    "ShortDesc" = collected_description
  )
}

Summary of Processing, QC, and Statistical Analysis

The following steps were applied to the dataset before differential analysis:

  • Sample Selection:
    Focused on two groups only: Prototype and MPSP. Other groups, if present, were excluded.

  • Protein Filtering Criteria:

    • Proteins were retained if detected with at least 2 peptides.
    • Only master proteins were included.
    • Known contaminants were excluded.
    • Further filtered to include proteins quantified in at least 2 replicates within one group (min_count = 2).
  • Peptide Quality Control:
    Peptides were retained with Sequest HT XCorr ≥ 1.5.

  • Normalization Strategy:
    Applied median normalization (log2-transformed after normalization).
    Other normalization strategies (quantile, sum, batch, etc.) are available but median is used by default.

  • Imputation Strategy:
    A 2-step imputation was applied to account for missing values:

    1. Group-wise MNAR imputation: for values consistently missing in specific groups.
    2. missForest: for remaining missing-at-random values.
  • Intensity Visualization:
    Violin plots and heatmaps of protein intensities were generated before and after normalization, along with QC for missingness and CV.

  • Statistical Testing:
    Performed two-sample t-tests between the two groups.
    Adjusted p-values were computed using Benjamini-Hochberg (FDR) correction.

  • Ratio Calculation:
    Log2-transformed fold changes were computed as:
    log2(mean Intensity Prototype / mean Intensity MPSP)

  • Differential Significance Thresholds:
    Proteins are labeled “Significant” if:

    • |log2FC| > 1, and
    • adjusted p-value ≤ 0.05

Volcano Plot

Click to expand: Volcano Plot
library(ggplot2)
library(plotly)
library(ggrepel)

# Set default strict criteria
df_volcano <- calculate_ratios_volcano_p_adjust(df_norm_valid_imp)

df_volcano$Significance <- ifelse(df_volcano$p.values.adj <= 0.05 & abs(df_volcano$RatioLog) > 1,
                                  "Significant", "Not significant")

# Define color palette
volcano_colors <- c("Significant" = "#D73027", "Not significant" = "darkgray")


# Volcano plot
volcano_plot <- ggplot(df_volcano, aes(x = RatioLog, y = -log10(p.values.adj),
                                     color = Significance,
                                     text = paste0("Gene: ", Gene, "<br>Log2FC: ", round(RatioLog, 2),
                                                   "<br>adj.p: ", signif(p.values.adj, 3)))) +
  geom_point(alpha = 0.7, size = 0.8) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  scale_color_manual(values = volcano_colors) +
  labs(
    title = paste0("Volcano Plot: ", GroupA, " vs ", GroupB),
    x = paste0("Log2(", ratio_name, ")"),
    y = "-log10(Adjusted p-value)") +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    plot.caption = element_text(size = 10, face = "italic"),
    legend.position = "right"
  )


# Convert to interactive
volcano_interactive <- plotly::ggplotly(volcano_plot, tooltip = "text")
volcano_interactive
# Save both PNG and SVG
ggsave(
  filename = "volcano_plot.png",
  plot     = volcano_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  dpi      = 300
)
ggsave(
  filename = "volcano_plot.svg",
  plot     = volcano_plot,
  path     = export_dir,
  width    = 6,
  height   = 5,
  units    = "in",
  device   = "svg"
)

Export Results

openxlsx::write.xlsx(x = df_volcano, file = "proteins_df_stat.xlsx")
save.image(file = paste0(Sys.Date(), "_NewStandardReport.RData"))

Summary

This report documents the complete data analysis pipeline for the two conditions mass spectrometry dataset, from data loading and filtering to normalization, imputation, statistical testing, and visualization.

Report generated on r Sys.Date() using RMarkdown.